{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "6b53f097-2dfd-4d15-884b-35423cf11061",
   "metadata": {},
   "source": [
    "# Lesson 3: Data leakage and toxicity"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f9fc01c2-4b01-4d8b-9047-b9fcfc95a441",
   "metadata": {},
   "source": [
    "## Setup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "801f0db6-a009-4ce0-aa5c-9d270749a172",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4fe518ac-086b-453b-975c-0e93a15031e7",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "# Show full column contents; a max_colwidth of None disables truncation.\n",
    "pd.options.display.max_colwidth = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0c56f405-55be-49b7-833e-88f4420d7a7b",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "import whylogs as why"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c4661e21-b428-45ac-9588-baa3a95c2c42",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "import helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "170e3918-9d8c-411d-ab95-c60b179f3b79",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "# Read chats.csv (path is relative to the working directory) into a DataFrame.\n",
    "chats = pd.read_csv(filepath_or_buffer=\"./chats.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7f8ecb01-e541-45a5-a407-781bcb2cde63",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "# Display only the row at position 10 (kept as a one-row DataFrame).\n",
    "chats.iloc[10:11]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "95355eb2-5c7d-485c-9619-42b14519bea3",
   "metadata": {},
   "source": [
    "## Data leakage \n",
    "\n",
    "### 1. Detect Patterns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "74425f23-545f-4f6c-a47c-147e97514c5d",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "from langkit import regexes"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "38decf76",
   "metadata": {},
   "source": [
    "**Note**: To view the next visuals, you may have to either hide the left-side menu bar or widen the notebook towards the right."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1ce990b2-efd3-4f58-b46a-e9e97c1d3b85",
   "metadata": {
    "height": 80
   },
   "outputs": [],
   "source": [
    "# Visualize the `prompt.has_patterns` metric for the chats data via the local helper.\n",
    "helpers.visualize_langkit_metric(chats, \"prompt.has_patterns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5e53b797-17cf-49e3-af5a-a725ab68f35b",
   "metadata": {
    "height": 80
   },
   "outputs": [],
   "source": [
    "# Visualize the `response.has_patterns` metric for the chats data via the local helper.\n",
    "helpers.visualize_langkit_metric(chats, \"response.has_patterns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1860285a-135f-4cd1-b511-4e0b0715d341",
   "metadata": {
    "height": 80
   },
   "outputs": [],
   "source": [
    "# Ask the local helper for the critical queries under `response.has_patterns`.\n",
    "helpers.show_langkit_critical_queries(chats, \"response.has_patterns\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1125b636-a926-4a22-a0b0-0250d42f615b",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "from whylogs.experimental.core.udf_schema import udf_schema"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "490db6b4-04a7-4b67-b110-d26a31b9b3c1",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "# Apply the UDF schema to the chats; the first element of the result is kept,\n",
    "# the second is discarded.\n",
    "annotated_chats, _ = (\n",
    "    udf_schema()\n",
    "    .apply_udfs(chats)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ea75e1f4-f232-4434-bd9f-94a78b7bd9cd",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "# Preview the first five annotated rows.\n",
    "annotated_chats.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "509a7679-14a8-4393-bf25-d6473a4b0211",
   "metadata": {
    "height": 63
   },
   "outputs": [],
   "source": [
    "# Keep rows where at least one of the two has_patterns columns is non-null.\n",
    "annotated_chats[\n",
    "    annotated_chats[[\"prompt.has_patterns\", \"response.has_patterns\"]]\n",
    "    .notnull()\n",
    "    .any(axis=1)\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "147043f0",
   "metadata": {},
   "source": [
    "**Note**: To view the next visual, you may have to either hide the left-side menu bar or widen the notebook towards the right."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d22c7659-1d7f-4dda-9aa1-a84959777e5a",
   "metadata": {
    "height": 97
   },
   "outputs": [],
   "source": [
    "# Evaluate, under the \"leakage\" scope, the rows where at least one of the\n",
    "# two has_patterns columns is non-null.\n",
    "helpers.evaluate_examples(\n",
    "    annotated_chats[\n",
    "        annotated_chats[[\"prompt.has_patterns\", \"response.has_patterns\"]]\n",
    "        .notnull()\n",
    "        .any(axis=1)\n",
    "    ],\n",
    "    scope=\"leakage\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e744894e-0f25-48fa-ab2d-a52e5d46c794",
   "metadata": {},
   "source": [
    "### 2. Entity recognition"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e78196ea-0a2c-43b6-9942-0ccc7d45d7a4",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "from span_marker import SpanMarkerModel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36eb459a-522c-4f18-8422-f5f1bd896050",
   "metadata": {
    "height": 63
   },
   "outputs": [],
   "source": [
    "# Load the pretrained SpanMarker checkpoint used for entity recognition below.\n",
    "entity_model = SpanMarkerModel.from_pretrained(\"tomaarsen/span-marker-bert-tiny-fewnerd-coarse-super\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3c09df55-bdf4-4c7b-9acc-1ded249e8c75",
   "metadata": {
    "height": 80
   },
   "outputs": [],
   "source": [
    "# Adjacent string literals are joined at compile time, so the prompt reaches\n",
    "# the model as one sentence with single spaces. A backslash continuation\n",
    "# inside the quotes would embed the next line's leading indentation in the text.\n",
    "entity_model.predict(\n",
    "    \"Write an funny email subject to Bill Gates that \"\n",
    "    \"describes a confidential product called Modelizer 900.\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5c07f0df-eacf-43d0-9832-8f16cd14c012",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "# Entity labels that the entity_leakage UDFs treat as potential leakage.\n",
    "leakage_entities = [\n",
    "    \"person\",\n",
    "    \"product\",\n",
    "    \"organization\",\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f1a8abf7-da4f-4a4f-b2c8-8ad6e776c43d",
   "metadata": {
    "height": 46
   },
   "outputs": [],
   "source": [
    "from whylogs.experimental.core.udf_schema import register_dataset_udf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0217612e-4cd2-4705-8069-e5bbe1ac05f2",
   "metadata": {
    "height": 216
   },
   "outputs": [],
   "source": [
    "@register_dataset_udf([\"prompt\"],\"prompt.entity_leakage\")\n",
    "def entity_leakage(text):\n",
    "    \"\"\"Label each prompt with the first sensitive entity type it mentions.\n",
    "\n",
    "    For every row of `text`, the prompt is run through `entity_model` and the\n",
    "    label of the first predicted entity that is listed in `leakage_entities`\n",
    "    with a score above 0.25 is kept; rows with no such entity yield None.\n",
    "    Returns a list with one entry per row.\n",
    "    \"\"\"\n",
    "    def first_sensitive_label(prompt):\n",
    "        for found in entity_model.predict(prompt):\n",
    "            is_sensitive = found[\"label\"] in leakage_entities\n",
    "            if is_sensitive and found[\"score\"] > 0.25:\n",
    "                return found[\"label\"]\n",
    "        return None\n",
    "\n",
    "    return [first_sensitive_label(row[\"prompt\"]) for _, row in text.iterrows()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "41b2eb3a-5347-44e9-9d4a-c7580e28a447",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "# Try the UDF directly on the first five chats.\n",
    "entity_leakage(chats.head(n=5))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d2cd1726-2e3b-49b7-9b42-e7d0a9c81541",
   "metadata": {
    "height": 216
   },
   "outputs": [],
   "source": [
    "# NOTE: this definition reuses the name `entity_leakage`, so from here on the\n",
    "# notebook-level name refers to this response variant, not the prompt one.\n",
    "@register_dataset_udf([\"response\"],\"response.entity_leakage\")\n",
    "def entity_leakage(text):\n",
    "    \"\"\"Label each response with the first sensitive entity type it mentions.\n",
    "\n",
    "    For every row of `text`, the response is run through `entity_model` and the\n",
    "    label of the first predicted entity that is listed in `leakage_entities`\n",
    "    with a score above 0.25 is kept; rows with no such entity yield None.\n",
    "    Returns a list with one entry per row.\n",
    "    \"\"\"\n",
    "    def first_sensitive_label(response):\n",
    "        for found in entity_model.predict(response):\n",
    "            is_sensitive = found[\"label\"] in leakage_entities\n",
    "            if is_sensitive and found[\"score\"] > 0.25:\n",
    "                return found[\"label\"]\n",
    "        return None\n",
    "\n",
    "    return [first_sensitive_label(row[\"response\"]) for _, row in text.iterrows()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a52a8290-70b9-49a0-8861-fe0afb94f982",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "# Re-apply the UDF schema so the frame is rebuilt after the UDF registrations\n",
    "# above; the second element of the result is discarded.\n",
    "annotated_chats, _ = (\n",
    "    udf_schema()\n",
    "    .apply_udfs(chats)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "553a0675-4938-4349-9ead-408bc5c0d587",
   "metadata": {
    "height": 63
   },
   "outputs": [],
   "source": [
    "# Ask the local helper for the critical queries under `prompt.entity_leakage`.\n",
    "helpers.show_langkit_critical_queries(chats, \"prompt.entity_leakage\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3745f190-efe8-494a-aabf-f9c3d34686f1",
   "metadata": {
    "height": 114
   },
   "outputs": [],
   "source": [
    "# Keep every row where at least one of the four leakage columns is non-null.\n",
    "annotated_chats[\n",
    "    annotated_chats[[\n",
    "        \"prompt.has_patterns\",\n",
    "        \"response.has_patterns\",\n",
    "        \"prompt.entity_leakage\",\n",
    "        \"response.entity_leakage\",\n",
    "    ]].notnull().any(axis=1)\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f0c23f6f",
   "metadata": {},
   "source": [
    "**Note**: To view the next visual, you may have to either hide the left-side menu bar or widen the notebook towards the right."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "78114d83-dcad-4c37-bed1-9620b4f09111",
   "metadata": {
    "height": 131
   },
   "outputs": [],
   "source": [
    "# Evaluate, under the \"leakage\" scope, every row where at least one of the\n",
    "# four leakage columns is non-null.\n",
    "helpers.evaluate_examples(\n",
    "    annotated_chats[\n",
    "        annotated_chats[[\n",
    "            \"prompt.has_patterns\",\n",
    "            \"response.has_patterns\",\n",
    "            \"prompt.entity_leakage\",\n",
    "            \"response.entity_leakage\",\n",
    "        ]].notnull().any(axis=1)\n",
    "    ],\n",
    "    scope=\"leakage\",\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "386c5e97-aa0e-4fa8-bb19-6dc404f01469",
   "metadata": {},
   "source": [
    "## Toxicity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "06d50c1c-8fc3-4665-b29c-7475e4164688",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": [
    "from transformers import pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "015b8332-9b82-4b25-ae8e-1de24bebfe48",
   "metadata": {
    "height": 63
   },
   "outputs": [],
   "source": [
    "# Text-classification pipeline built from the tomh/toxigen_hatebert model\n",
    "# and the bert-base-cased tokenizer.\n",
    "toxigen_hatebert = pipeline(\n",
    "    \"text-classification\",\n",
    "    model=\"tomh/toxigen_hatebert\",\n",
    "    tokenizer=\"bert-base-cased\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c148d525-fd54-447d-b473-7d4413b9ed91",
   "metadata": {
    "height": 63
   },
   "outputs": [],
   "source": [
    "# Run the classifier on two example sentences to inspect its output format.\n",
    "toxigen_hatebert(\n",
    "    [\n",
    "        \"Something non-toxic\",\n",
    "        \"A benign sentence, despite mentioning women.\",\n",
    "    ]\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a852cc26-7796-48d9-af80-e8a9a79ce517",
   "metadata": {
    "height": 80
   },
   "outputs": [],
   "source": [
    "@register_dataset_udf([\"prompt\"],\"prompt.implicit_toxicity\")\n",
    "def implicit_toxicity(text):\n",
    "    \"\"\"Classify every prompt in `text` with the `toxigen_hatebert` pipeline.\n",
    "\n",
    "    Returns one int per prompt, taken from the last character of the\n",
    "    predicted label (presumably labels look like `LABEL_0` / `LABEL_1`;\n",
    "    verify against the model card).\n",
    "    \"\"\"\n",
    "    prompts = text[\"prompt\"].to_list()\n",
    "    predictions = toxigen_hatebert(prompts)\n",
    "    return [int(prediction[\"label\"][-1]) for prediction in predictions]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b4f5637-30c1-4caf-9a44-17cef0ffb7dd",
   "metadata": {
    "height": 63
   },
   "outputs": [],
   "source": [
    "# Ask the local helper for the critical queries under `prompt.implicit_toxicity`.\n",
    "helpers.show_langkit_critical_queries(annotated_chats, \"prompt.implicit_toxicity\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e40f6396-d860-4c6a-be76-edf3e7524d4a",
   "metadata": {
    "height": 29
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
